capture program drop reclink2
program define reclink2
version 8.2

*! N. Wasi modified M.Blasnik's reclink program
*! last modified August 12,2014: add label to variables from using data
*! May 19,2014: fix bugs at orblock and required loops. 
*! In an earlier version, the program may crash when some required vars are missing
*! March 28, 2014
*! - add option manytoone, npairs
*! - prevent various macro problems 
*!   caused by single quote, double quote,or an unmatched parenthese in a string
*! - add that the required block has to be conditional on the variable is not missing

*! v 1.7 14-Jan-2010 M. Blasnik:  Record Linkage 
*! 1.7 fixes typo in winkler adjustment to bigram calculation that prevented the adjustment
*! 1.6 fixes file paths with spaces -- encloses all file references in double quotes
*! 1.5 fixes bug introduced in second release and orblock none bug

* ---------------------------------------------------------------------------
* reclink2: fuzzy record linkage between the data in memory (master) and a
* using file.
*
* Arguments and options (as declared on the syntax line below):
*   varlist      matching variables in the master data
*   using        file name of the using dataset
*   idmaster()   variable uniquely identifying master records (checked by isid)
*   idusing()    variable uniquely identifying using records (checked by isid)
*   gen()        name of the new variable that receives the matching score
*   wmatch()     one match weight per matching variable, each >=1 (default: all 1)
*   wnomatch()   one non-match weight per matching variable, each >=1
*                (default: same as wmatch)
*   exclude()    file holding idmaster and idusing values; those ids are
*                removed from both datasets before matching
*   uvarlist()   names of the matching variables in the using data, in the
*                same order as varlist (default: varlist)
*   minscore()   minimum score for a candidate to be kept (default 0.6)
*   minbigram()  bigram threshold used in the score mapping (default 0.6)
*   orblock()    variables for OR blocking; "none" switches blocking off;
*                defaults to varlist when there are more than 3 variables
*   required()   variables that must agree exactly whenever the master value
*                is non-empty
*   _merge()     name of the final merge indicator (default _merge)
*   uprefix()    prefix given to using variables that share a master name
*                (default U)
*   exactstr()   string variables compared for equality instead of by bigram
*   debug()      if given, file name used for the posted results instead of
*                a temporary file
*   manytoone    leave perfectly matched using records available as fuzzy
*                candidates for other master records
*   npairs()     number of candidates to output per master record (default 1);
*                ties are kept
*
* Exits with return code 198 when an option fails validation.
* ---------------------------------------------------------------------------
syntax varlist using/ ,  IDMaster(str) IDUsing(str) Gen(str) [WMatch(str) WNOMatch(str) EXClude(str) /* 
*/   UVarlist(str) MINScore(real 0.6) MINBigram(real 0.6) ORBLock(str) REQuired(str) /*
*/ _Merge(str) UPRefix(str) EXActstr(str) DEBug(str) MANYtoone NPairs(int 1)]

* the merge indicator must be a new variable
if "`_merge'"=="" local _merge "_merge"
cap confirm variable `_merge'
if _rc==0 {
	di as err "`_merge' already exists"
	exit 198
}

* fill in defaults for the weights and the using-variable prefix
local nvars: word count `varlist'
if "`wmatch'"=="" local wmatch: di _dup(`nvars')  " 1 "
if "`wnomatch'"=="" local wnomatch "`wmatch'"
if "`uprefix'"=="" local uprefix "U"

* validate BOTH weight lists; an earlier version misspelled the wnomatch
* macro here, so the non-match weights were never checked
foreach val in `wmatch' `wnomatch'  {
	if `val'<1 {
		di as error " weights must be >=1"
		exit 198
	}
}

local nwms:  word count `wmatch'
local nwnms: word count `wnomatch'
if `nvars'!=`nwnms' | `nvars'!=`nwms'  {
	di as error "wmatch and wnomatch must each have the same # of elements as varlist"
	exit 198
}

if "`uvarlist'"=="" local uvarlist "`varlist'"
local nuvars: word count `uvarlist'
if `nvars'!=`nuvars' {
	di as error "uvarlist must have the same # of elements as varlist"
	exit 198
}

if `:list required in varlist'==0 {
		di as error "all variables in required() option must also be in the main varlist"
		exit 198
}
	
* "none" is compared case-insensitively, consistent with the blocking test
* inside the matching loop further down
if lower("`orblock'")!="none" & `:list orblock in varlist'==0 {
		di as error "all variables in orblock() option must also be in the main varlist"
		exit 198
}

* with more than 3 matching variables, block on all of them by default
if "`orblock'"=="" & `nvars'>3 local orblock "`varlist'"

* per-variable settings, indexed by position in varlist:
*   v#      master variable name         uv#     using variable name
*   wm#     match weight                 wnm#    non-match weight
*   string# 1 if the variable is string  bigram# 0 if listed in exactstr(), else 1
*   inreq#  1 if listed in required()
local i=1
foreach var of local varlist {
	local v`i' "`var'"
	local wm`i': word `i' of `wmatch'
	local wnm`i': word `i' of `wnomatch'
	local uv`i': word `i' of `uvarlist'
	cap confirm string var `var'
	local string`i'=(_rc==0)
	local bigram`i': list var in exactstr
	local bigram`i'=1-`bigram`i''
	local inreq`i': list var in required
	local i=`i'+1
}

* scaling factor for bigram score mapping to matchscore 
local biscale=.5/(1-`minbigram')^(1/3)

* the original master data is restored further down before the final merges
preserve

quietly {
isid `idmaster'
gen `gen'=0
label var `gen' "reclink matching score"
local mobs=c(N)
tempfile m u matched  idusingmatched idmastermatched 
* results go to a temporary file unless debug() names a file to keep
if "`debug'"=="" tempfile results
else local results "`debug'"

* working copy of the master data: id, score and matching variables only
keep `idmaster' `gen' `varlist'
sort `idmaster'
local idmtype: type `idmaster'
save "`m'"

if "`exclude'"!="" {
	* drop from both datasets every id that appears in the exclude file
	tempfile midexclude uidexclude using2
	tempvar mrgtmp
	use `idmaster' `idusing' using "`exclude'"
	drop if missing(`idmaster') | missing(`idusing')
	save "`midexclude'"
	
	* distinct using ids to exclude
	bysort `idusing': keep if _n==1
	keep `idusing'
	sort `idusing'
	save "`uidexclude'"
	
	* distinct master ids to exclude; keep only master records not among them
	use "`midexclude'"
	bysort `idmaster': keep if _n==1
	keep `idmaster'
	sort `idmaster'
	merge `idmaster' using "`m'", _merge(`mrgtmp')
	keep if `mrgtmp'==2
	drop `mrgtmp'	
	sort `idmaster'
	local mobs2match=c(N)
	save "`m'", replace

	* keep only using records whose id is not in the exclude list
	use "`using'"
	local uobs=c(N)
	sort `idusing'
	merge `idusing' using "`uidexclude'", _merge(`mrgtmp') nokeep
	keep if `mrgtmp'==1
	drop `mrgtmp'
	save "`using2'"
}
else {
	local using2 "`using'"
}

* prepare the using data: id plus matching variables, renamed to master names
use "`using2'"
local uobs2match=c(N)
if "`uobs'"=="" local uobs=c(N)
isid `idusing'
keep `idusing' `uvarlist'
local i=1
foreach var of varlist `uvarlist' {
	if "`var'"!="`v`i''" {
		cap rename `var' `v`i''
	}
	local i=`i'+1
}
sort `varlist'
local idutype: type `idusing'
* one group number per distinct combination of matching values
* (missing values form groups too)
tempvar uidtmp
egen `uidtmp'=group(`varlist'), missing
save "`u'"

* create lookup table back to original id variable in using dataset
keep `uidtmp' `idusing'
tempfile uids
sort `uidtmp'
save "`uids'"

* now collapse the using dataset down to unique combinations of varlist
use "`u'"
drop `idusing'
bysort `uidtmp' : keep if _n==1
sort `varlist' 
save "`u'", replace
local uniqusingn=c(N)

* identify perfect matches
use "`m'"
sort `varlist'
merge `varlist' using "`u'", nokeep
keep if _merge==3
local perfectn=c(N)
noi di as result _n "`perfectn' perfect matches found" _n


if `perfectn'>0 {
	* perfect matches get a score of 1
	replace `gen'=1
	keep `idmaster' `uidtmp' `gen'
	sort `idmaster'
	save "`matched'"

	* save using ids for perfect matches
	keep `uidtmp'  
	sort `uidtmp'
	save "`idusingmatched'"

	* save master ids for perfect matches	
	use "`matched'"
	keep `idmaster'
	sort `idmaster'
	save "`idmastermatched'"

	* get rid of perfect matches from using datafile 

	**---NW's added if block to allow for sampling with replacement from using dataset---**
	if "`manytoone'"=="" {
		use "`u'"
		sort `uidtmp'
		merge `uidtmp' using  "`idusingmatched'"
		keep if _merge==1
		drop _merge
		save "`u'", replace
	}
	**-----------------------------------------------------------------------------------**

		
	* get rid of perfect matches from master data file
	use "`m'"
	sort `idmaster'
	merge `idmaster' using  "`idmastermatched'"
	keep if _merge==1
	drop _merge
	save "`m'", replace	
}
	
else {
	* no perfect matches: reload the untouched master working file
	use "`m'", replace
}

* number of master records still to be matched by the fuzzy pass
local tomatch=c(N)

* every candidate pair found below is posted as (master id, using group, score)
tempname post
postfile `post' `idmtype' `idmaster' long `uidtmp' float `gen' using "`results'"
if `tomatch'>1000 {

noi di as text "Going through `tomatch' observation to assess fuzzy matches, each .=5% complete"
}

* main loop: one remaining master record at a time
forvalues obs=1(1)`tomatch' {
	use "`m'" in `obs', replace
		
	* progress dot each time another 5% of the records has been processed
	if int(20*`obs'/`tomatch')!=int(20*(`obs'-1)/`tomatch') & (`tomatch'>1000) noi di "." _c
	* ifany = OR of blocking conditions, ifreq = AND of required conditions,
	* ifall = the complete if-qualifier used to load candidate using records
	local ifall
	local ifany
	local ifreq
* OR blocking
	* required variables are handled separately, so take them out of the block list
	local orblock : list orblock - required
	if lower("`orblock'")!="none" & "`orblock'"!="" {
		local i=1
		foreach v of local orblock {
			local thisval=`v'[1]
			
				**----NW's added lines to remove " or ` which will cause macro problems----**
				capture	if regexm(`"`thisval'"',`"""') ==1 local thisval = subinstr(`"`thisval'"',`"""',"",.)	
				capture if regexm("`thisval'","`") ==1 local thisval = subinstr("`thisval'","`","",.)	
				**--------------------------------------------------------------------------**	
				
				* empty master values contribute no blocking condition
				if "`thisval'"!="" {
					if `"`ifany'"'!="" {
					local ifany `" `ifany' | "'
					}
					cap confirm string var `v'
					
					* string values need quotes in the condition, numeric ones do not
					if _rc==0 {
					local ifany `" `ifany' `v'=="`thisval'" "'										
					}
					else {
					local ifany `" `ifany' `v'==`thisval' "'
					}
				**-----------------NW's modified line--------------------**
				if `"`ifany'"' !="" local ifall `" if (`ifany') "'	
				//old command: local ifall `" if (`ifany') "'
				//this generates `" if ( ) "' for a missing string 
				**-------------------------------------------------------**
				} //end if thisval
		local i=`i'+1
		} // end foreach v	
	
	} /* end of OR blocking */
	
	
	* now add required matches
	if "`required'"!="" {
		local i=1
		foreach v of local required {
				
			local thisval=`v'[1]
			
				**----NW's added lines to remove " or ` which will cause macro problems----**
				capture	if regexm(`"`thisval'"',`"""') ==1 local thisval = subinstr(`"`thisval'"',`"""',"",.)	
				capture if regexm("`thisval'","`") ==1 local thisval = subinstr("`thisval'","`","",.)	
				**--------------------------------------------------------------------------**	
				
			**--NW's added if "`thisval'" != "" block to prevent blocking on missing values--**
			if "`thisval'"!="" {	
				if `"`ifreq'"'!="" {
					local ifreq `" `ifreq' & "'
				}
				cap confirm string var `v'
				if _rc==0 {
					local ifreq `" `ifreq' `v'=="`thisval'" "'				
				}
				else {
					local ifreq `" `ifreq' `v'==`thisval' "'
				}
			* append the accumulated required conditions to any existing qualifier
			if `"`ifall'"'!="" local ifall `" `ifall' & (`ifreq') "'

			**-----------------------NW's modified line-------------------------------------**
			else if `"`ifreq'"'!=""  local ifall `" if (`ifreq') "'
			//old command: local ifall `" if (`ifreq') "'
			//the old command does not generate error without the if "`thisval'"!="" condition
			**----------------------------------------------------------------------------**
			
			} //end if thisval

		local i=`i'+1
		} // end foreach v

				
	} /* end of required matches */
		
	* remember the master record's id and matching values before the using
	* data replaces it in memory
	local thisidmaster=`idmaster'[1]
	local i=1
	foreach v of local varlist {
			local thisv`i'=`v'[1]
			local i=`i'+1
	}
		
* still need to go thru varlist to drop single quotes that cause macro problems?
//NW: hopefully now the problems are prevented.

	* load using data which meets required and orblock 
	
	use "`u'"  `ifall', replace
	local upossn=c(N)
	if `upossn'>0 {

	* scale bigram matches from minscore to 1 into 0->1
		* M accumulates match evidence and NM non-match evidence per candidate
		tempvar M NM score bi
		gen `M'=0
		gen `NM'=0
		local i=1
		*v from using
		*thisv`i' from master
		foreach v of local varlist {
				
				**----NW's added lines to remove " or ` which will cause macro problems----**
				capture	if regexm(`"`thisv`i''"',`"""') ==1 local thisv`i' = subinstr(`"`thisv`i''"',`"""',"",.)	
				capture if regexm("`thisv`i''","`") ==1 local thisv`i' = subinstr("`thisv`i''","`","",.)	
				**-------------------------------------------------------------------------**					
				
				if "`thisv`i''"!="" {			
					if `inreq`i''!=1 {
						* string variables not in exactstr() are scored by bigram1;
						* everything else is scored 1 or 0 for equality
						if `string`i''==1 & `bigram`i''==1 {							
							**---NW's modified lines to prevent a macro problem caused by ) or ( ---**
							bigram1 `v', match("`thisv`i''") gen(`bi')
							// old command: bigram1 `v', match(`thisv`i'') gen(`bi')
						    **----------------------------------------------------------------------**
							}						
						else {
							cap drop `bi'
							if `string`i''==1 gen `bi'=(`v'=="`thisv`i''")
							else gen `bi'=(`v'==`thisv`i'')
						}
					replace `M'=`M'+`wm`i''*(.5*`bi'^2+cond(`bi'>`minbigram',`biscale'*(`bi'-`minbigram')^(1/3),0))
					replace `NM'=`NM'+`wnm`i''*cond(`bi'<(`minbigram'-.2),1,cond(`bi'<`minbigram',1-`bi'^2,(1-`bi')^2)) if !missing(`v') 
					}
					else {
						* candidates were already filtered on required variables,
						* so the full match weight is credited
						replace `M'=`M'+`wm`i''
					}
				}
				* missing in one, not other = 30% non-match
				replace `NM'=`NM'+`wnm`i''*0.3 if (missing("`thisv`i''") + missing(`v')) ==1
			local i=`i'+1
		} 
		
		gen `score'=`M'/(`M'+`NM')		
		
		* 0/0 gives missing; treat that as a zero score
		replace `score'=0 if `score'==.
		
		**-------NW's modified block to allow for output multiple candidates----------**
		if `npairs'==1 {
			sort `score'				
			* now keep the cases with the highest score and post them
			
			//NW: also a minor modification here, use >= instead of > so that 
			//records with zero score can be output if needed. otherwise, need to set minscore at negative value
			keep if `score'==`score'[_N] & `score'>=`minscore'			
			//old command: keep if `score'==`score'[_N] & `score'>`minscore'			
			local matches=c(N)
			if `matches'>0 {
				local scoreval=`score'[1]
				forval j=1(1)`matches' {
					local thisidusing = `uidtmp'[`j']
					post `post' (`thisidmaster') (`thisidusing') (`scoreval') 
				}
			}
		}
		else {			
			gsort -`score'			
			keep if `score' >= `minscore'			
			local Nobs_tmp = c(N)			
			//now drop some obs if Nobs_tmp> npairs
			if `Nobs_tmp' >`npairs' {
				local n_out = 1 
				local j = 1
				
				* extend the cut-off until npairs rows are kept and the next
				* row no longer ties with the last one kept
				while `j' <= `Nobs_tmp' & (`n_out' < `npairs' | (`score'[`j'] == `score'[`j'+1])) {
					local n_out = `n_out'+1 
					local j = `j'+1
				}
				keep in 1/`n_out'
			}
			local matches=c(N)
			if `matches'>0 {
				forval j=1(1)`matches' {
					local scoreval=`score'[`j']
					local thisidusing = `uidtmp'[`j']
					post `post' (`thisidmaster') (`thisidusing') (`scoreval') 
				}
			}			
		} // end else where npairs!=1
		**------------------------------------------------------------------------------**		
	} 
}
postclose `post'

* now merge the matching back into master 
use "`results'", replace
local resobs=c(N)
if `perfectn'>0 {
	append using "`matched'"
}
* translate the using group number back to the original using ids
tempvar tmrg
sort `uidtmp'
merge `uidtmp' using "`uids'", nokeep _merge(`tmrg')
assert `tmrg'==3
drop `tmrg' `uidtmp'
sort `idmaster'
save, replace

* clean up using data -- make sure no varnames in common so mismatches can be examined
use "`using'", replace
forvalues i=1(1)`nvars' {
	cap confirm var `v`i'' 
	if _rc==0 {
		rename `v`i'' `uprefix'`v`i'' 
		* label the renamed variable; an existing label is cut to 62
		* characters before the suffix is appended
		local labeltmp "`:var label `uprefix'`v`i'''"
		if "`labeltmp'"=="" label var `uprefix'`v`i'' "`v`i'' from using data"
		else { 
			if length("`labeltmp'")<=62 label var `uprefix'`v`i'' "`labeltmp' from using data"
			else {
				local labeltmp = substr("`labeltmp'",1,62)
				label var `uprefix'`v`i'' "`labeltmp'...from using data"
			}
		}	
	}
}
tempfile u2
sort `idusing'
save "`u2'"

* now back to original dataset and merge results into that, then using dataset into that
restore
sort `idmaster'
merge `idmaster' using "`results'", _merge(`tmrg')
assert `tmrg'!=2
sort `idusing'
merge `idusing' using "`u2'" , nokeep _merge(`_merge')

**---NW's added lines to sort the data by scores for multiple candidates---**
gsort `idmaster' -`gen'
**-------------------------------------------------------------------------**

local fobs=c(N)
qui count if missing(`idusing')
local nomatch=r(N)
* place each using variable next to its master counterpart
forvalues i=1(1)`nvars' {
	cap move  `uprefix'`v`i'' `v`i''  
	cap move  `v`i'' `uprefix'`v`i''   
}
} /* end of quietly block */

* summary report
format `gen' %5.4f
noi di _n "Added: `idusing'= identifier from `using'   `gen' = matching score"
noi di as res "Observations:  Master N = `mobs'    `using' N= `uobs' "
if "`exclude'"!="" {
	noi di as res "  # Obs after excluding matches in `exclude':  Master = `mobs2match'    `using': `uobs2match' "
}
noi di as res "  Unique Master Cases: matched = " `mobs'-`nomatch'   " (exact = `perfectn'), unmatched = `nomatch'"
end

**---------------------end main program------------------------------------**

capture program drop bigram1
program define bigram1
version 8.2
*! version 2.2 M Blasnik 19-Sep-2005
* bigram1: similarity between the string variable in varlist and the fixed
* string given in match().
*   varlist  one string variable
*   gen()    variable receiving the score; created if absent, reset to 0 if present
*   match()  the string every observation is compared with
* Exact matches score 1 and empty values of the variable score 0.
syntax varlist (max=1) , gen(str) match(str) 
di "match from bigram = " "`match'"
tempvar vlen prefixhits contained

* start the score variable from zero
capture confirm variable `gen'
if _rc {
	gen `gen'=0
}
else {
	replace `gen'=0
}

local targetlen=length("`match'")
gen `vlen'=length(`varlist')

* count the two-character pieces of match() found anywhere in the variable,
* then scale by the combined number of bigrams
if `targetlen'>2 {
	local lastpos=`targetlen'-1
	forvalues pos=1/`lastpos' {
		qui replace `gen'=`gen'+1 if index(`varlist',substr("`match'",`pos',2))>0
	}
	qui replace `gen'=`gen'*2/(`vlen'+`targetlen'-2)
}

* deal with strings <3 characters
* contained = 1 when either string occurs inside the other
qui gen byte `contained'=( index("`match'",`varlist') | index(`varlist',"`match'") )
qui replace `gen'=0 if (`targetlen'<3 | `vlen'<3) & !`contained'
qui replace `gen'=min(`targetlen',`vlen')/(`targetlen'+`vlen'-1)  if (`targetlen'<3 | `vlen'<3) & `contained'

* Winkler adjustment: adjusts score upward based on first 1,2,3, or 4 characters matching
gen byte `prefixhits'=0
forvalues k=1/4 {
	qui replace `prefixhits'=`prefixhits'+1 if substr(`varlist',1,`k')==substr("`match'",1,`k')
}
qui replace `gen'=`gen'+`prefixhits'*(1-`gen')/10

* make sure exact matches =1 and missing =0 
qui replace `gen'=1 if `varlist'=="`match'"
qui replace `gen'=0 if `vlen'==0
end
